LASSO: Least Absolute Shrinkage and Selection Operator
Simulate data
library(data.table)
set.seed(123456)
n <- 1000
dt <- data.table(
p0 = rep(0.2, n)
, or1 = rep(1, n)
, var1 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
, or2 = rep(1.1, n)
, var2 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
, or3 = rep(1.2, n)
, var3 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.2, 0.8))
, or4 = rep(1.5, n)
, var4 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
, or5 = rep(1.7, n)
, var5 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
, or6 = rep(2, n)
, var6 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
, or7 = rep(5, n)
, var7 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.1, 0.9))
)
dt <- dt[, odds0 := p0 / (1 - p0)
][, log_odds := log(odds0) +
var1 * log(or1) +
var2 * log(or2) +
var3 * log(or3) +
var4 * log(or4) +
var5 * log(or5) +
var6 * log(or6) +
var7 * log(or7)
][, p := exp(log_odds)/ (1 + exp(log_odds))]
vsample <- function(p){
sample(c(1, 0), size = 1, replace = TRUE, prob = c(p, 1 - p))
}
vsample <- Vectorize(vsample)
dt <- dt[, outcome := vsample(p)]
unique(dt[, .(or1, or2, or3, or4, or5, or6, or7)]) %>% prt(caption = "Variables with Odds Ratios")| or1 | or2 | or3 | or4 | or5 | or6 | or7 |
|---|---|---|---|---|---|---|
| 1 | 1.1 | 1.2 | 1.5 | 1.7 | 2 | 5 |
GLM
m <- glm(outcome ~ var1 + var2 + var3 + var4 + var5 + var6 + var7
, data = dt
, family = binomial
)
library(sjPlot)
tab_model(m)| Â | outcome | ||
|---|---|---|---|
| Predictors | Odds Ratios | CI | p |
| (Intercept) | 0.18 | 0.09 – 0.37 | <0.001 |
| var1 | 1.05 | 0.74 – 1.47 | 0.796 |
| var2 | 1.52 | 1.11 – 2.09 | 0.009 |
| var3 | 1.51 | 1.03 – 2.19 | 0.031 |
| var4 | 1.67 | 1.21 – 2.31 | 0.002 |
| var5 | 1.55 | 1.13 – 2.12 | 0.007 |
| var6 | 1.94 | 1.42 – 2.65 | <0.001 |
| var7 | 5.04 | 3.16 – 8.10 | <0.001 |
| Observations | 1000 | ||
| R2 Tjur | 0.097 | ||
glmnet
- \(min_{\beta_0, \beta} \frac{1}{N} \sum_{i=1}^N w_il(y_i, \beta_0 + \beta^Tx_i) + \lambda[(1 - \alpha)||\beta||_2^2 + \alpha||\beta||_1]\)
- L1 penalty: \(\alpha = 1\)
mx <- dt[, .(var1, var2, var3, var4, var5, var6, var7)]
mx <- as.matrix(mx)
## ss <- scale(mmx)
## scale_scale <- attr(ss, 'scaled:scale')
set.seed(123456)
library(glmnet)
lss <- glmnet(x = mx
, y = dt$outcome
, family = "binomial"
, alpha = 1
, standardize = TRUE
)
library(plotmo)
plot_glmnet(lss, xvar = "rlambda")Cross Validation
t1 <- Sys.time()
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
set.seed(123456)
cvfit <- cv.glmnet(x = mx
, y = dt$outcome
, type.measure = "auc"
, family = "binomial"
, standardize = TRUE
, parallel = TRUE
)
stopCluster(cl)
t2 <- Sys.time()
print(t2 - t1)Time difference of 1.941566 secs
The Selected Model
glmcoef <- coef(lss, cvfit$lambda.1se)
betas <- data.table(Variable = glmcoef@Dimnames[[1]]
, beta = as.matrix(glmcoef)
)
colnames(betas) <- c("Variable", "beta")
betas <- betas[, OR := exp(beta)]
betas %>% prt(digits = c(0, 6, 6, 6), caption = "Beta Coefficients of the LASSO Selected Model")| Variable | beta | OR |
|---|---|---|
| (Intercept) | 0.225767 | 1.253284 |
| var1 | 0.000000 | 1.000000 |
| var2 | 0.000000 | 1.000000 |
| var3 | 0.000000 | 1.000000 |
| var4 | 0.018369 | 1.018538 |
| var5 | 0.004475 | 1.004485 |
| var6 | 0.155754 | 1.168538 |
| var7 | 1.001301 | 2.721821 |
AUC of The Selected Model
- For plotly output to set chunk option
out.height='200%'
Adaptive LASSO
set.seed(123456)
library(glmnet)
ridge <- glmnet(x = mx
, y = dt$outcome
, family = "binomial"
, alpha = 0
, standardize = TRUE
)
ridge_cv <- cv.glmnet(x = mx
, y = dt$outcome
, alpha = 0
, type.measure = "auc"
, family = "binomial"
, standardize = TRUE
)
best_ridge_coef <- as.numeric(coef(ridge, s = ridge_cv$lambda.1se))[-1]
t1 <- Sys.time()
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
set.seed(20210817)
cvfit <- cv.glmnet(x = mx
, y = dt$outcome
, alpha = 1
, penalty.factor = 1 / abs(best_ridge_coef)
, type.measure = "auc"
, family = "binomial"
, standardize=TRUE
, parallel = TRUE
)
stopCluster(cl)
t2 <- Sys.time()
## print(t2 - t1)
lss_adp <- glmnet(x = mx
, y = dt$outcome
, alpha = 1
, penalty.factor = 1 / abs(best_ridge_coef)
, family = "binomial"
, standardize = TRUE
)
cfs <- lapply(cvfit$lambda, function(x){
l <- coef(lss_adp, x)
invisible(l@Dimnames[[1]][l@i + 1])
})
ls <- do.call(c, cfs)
ls <- unique(ls, fromLast = FALSE)[-1]
ls %>% prt(caption = "Order of Variables Selected", col.name = "Variable Name")| Variable Name |
|---|
| var7 |
| var6 |
| var4 |
| var5 |
| var2 |
| var3 |
glmcoef <- coef(lss_adp, cvfit$lambda.1se)
betas <- data.table(Variable = glmcoef@Dimnames[[1]]
, beta = as.matrix(glmcoef)
)
colnames(betas) <- c("Variable", "beta")
betas <- betas[, OR := exp(beta)]
betas %>% prt(digits = c(0, 6, 6, 6), caption = "Beta Coefficients from Adaptive LASSO")| Variable | beta | OR |
|---|---|---|
| (Intercept) | -0.347459 | 0.706481 |
| var1 | 0.000000 | 1.000000 |
| var2 | 0.000000 | 1.000000 |
| var3 | 0.000000 | 1.000000 |
| var4 | 0.116261 | 1.123289 |
| var5 | 0.059358 | 1.061155 |
| var6 | 0.309546 | 1.362806 |
| var7 | 1.451995 | 4.271626 |
R sessionInfo
R version 4.1.2 (2021-11-01) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages: [1] pROC_1.18.0 doParallel_1.0.17 iterators_1.0.14
[4] foreach_1.5.2 plotmo_3.6.2 TeachingDemos_2.12 [7] plotrix_3.8-2 Formula_1.2-4 glmnet_4.1-4
[10] sjPlot_2.8.10 Wu_0.0.0.9000 flexdashboard_0.5.2 [13] lme4_1.1-27.1 Matrix_1.4-0 mgcv_1.8-38
[16] nlme_3.1-152 png_0.1-7 scales_1.2.0
[19] nnet_7.3-16 labelled_2.8.0 kableExtra_1.3.4
[22] plotly_4.9.4.1 gridExtra_2.3 ggplot2_3.3.6
[25] DT_0.18 tableone_0.13.0 magrittr_2.0.3
[28] lubridate_1.7.10 dplyr_1.0.9 plyr_1.8.6
[31] data.table_1.14.2 rmdformats_1.0.2 knitr_1.39
loaded via a namespace (and not attached): [1] minqa_1.2.4 colorspace_2.0-3 ellipsis_0.3.2 sjlabelled_1.2.0 [5] estimability_1.3 parameters_0.18.0 rstudioapi_0.13 fansi_1.0.3
[9] mvtnorm_1.1-3 xml2_1.3.3 codetools_0.2-18 splines_4.1.2
[13] sjmisc_2.8.9 jsonlite_1.7.2 nloptr_1.2.2.2 ggeffects_1.1.2
[17] broom_0.8.0 effectsize_0.7.0 compiler_4.1.2 httr_1.4.2
[21] sjstats_0.18.1 emmeans_1.7.4-1 backports_1.2.1 assertthat_0.2.1 [25] fastmap_1.1.0 lazyeval_0.2.2 survey_4.0 cli_3.3.0
[29] htmltools_0.5.2 tools_4.1.2 gtable_0.3.0 glue_1.6.2
[33] Rcpp_1.0.8.3 jquerylib_0.1.4 vctrs_0.4.1 svglite_2.1.0
[37] crosstalk_1.1.1 insight_0.17.1 xfun_0.31 stringr_1.4.0
[41] rvest_1.0.0 lifecycle_1.0.1 klippy_0.0.0.9500 MASS_7.3-54
[45] hms_1.1.0 yaml_2.3.5 sass_0.4.0 stringi_1.7.6
[49] highr_0.9 bayestestR_0.12.1 boot_1.3-28 shape_1.4.6
[53] rlang_1.0.2 pkgconfig_2.0.3 systemfonts_1.0.2 evaluate_0.15
[57] lattice_0.20-45 purrr_0.3.4 htmlwidgets_1.5.4 tidyselect_1.1.2 [61] bookdown_0.22 R6_2.5.1 generics_0.1.2 DBI_1.1.1
[65] pillar_1.7.0 haven_2.4.1 withr_2.5.0 survival_3.2-13
[69] datawizard_0.4.1 tibble_3.1.7 performance_0.9.0 modelr_0.1.8
[73] crayon_1.5.1 utf8_1.2.2 rmarkdown_2.10 grid_4.1.2
[77] forcats_0.5.1 digest_0.6.29 webshot_0.5.2 xtable_1.8-4
[81] tidyr_1.1.3 munsell_0.5.0 viridisLite_0.4.0 bslib_0.2.5.1
[85] mitools_2.4